期中專案(OLIST)資料探索

pacman::p_load(dplyr, ggplot2, readr, plotly, googleVis,ggthemes,grid,tidytext,wordcloud2)


load("data/olist.rdata")
load("data/Z.rdata")



【A】產品分類的銷售量是否會影響評分呢?

A<-left_join(P,TPC,by="product_category_name")
oisim<-I[,c(1,3)]
pando<-merge(A,oisim,by="product_id",all.x=T)
pandr<-merge(R,pando,by="order_id",all.x=T,all.y=T)
g = ggplot(segment, aes(x=totalRev, y=avgScore, col=business_segment)) +
  geom_point(aes(size=(avgItemsSold)))
ggplotly(g)
#1.使用ggplot套件繪圖,X軸為總營收,Y軸為平均分數,顏色則依照產品品類做分佈
#2.點狀圖大小為產品平均銷售量
#3.圖片中發現有outlier為Watches
segment1 <- segment[segment$totalRev<100000,] #排除掉watch品項(outlier)
g = ggplot(segment1, aes(x=totalRev, y=avgScore, col=business_segment)) +
  geom_point(aes(size=(avgItemsSold)))+
  geom_smooth(aes(x=totalRev, y=avgScore, col=business_segment),method = lm)
ggplotly(g)

🗿 商業分析:
  ■  排除掉watch(outlier)後,可以看到tatal Revenue和average score成正向關係。
  ■  可以看出評分以及總營收的分佈,並且可藉由平均銷售(泡泡大小)能了解市場大小。
  ■  可以發現在 health_beauty營收、評分最高,市場大小也不錯,games_consoles營收、評分最低,市場也小 。

💡 建議:
 新進賣家可以選擇從銷售量較高的產品開始販售,獲得的評分會較高。



【B】產品照片的數量是否會影響評論分數?

df01 = left_join(unique(I[,c(1,3)]), unique(left_join(O[,1,F], R[,1:3])[,-2])) %>%
  group_by(product_id) %>% summarise(
  noReview = n(),
  avgScore = mean(review_score)) %>%
  left_join(P[,c(1,5)])
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "product_id"
df01 = df01[complete.cases(df01),]

cor(df01$avgScore, df01$product_photos_qty)
[1] 0.021683
cor.test(df01$avgScore, df01$product_photos_qty)

    Pearson's product-moment correlation

data:  df01$avgScore and df01$product_photos_qty
t = 3.9, df = 32300, p-value = 0.000096
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.010787 0.032574
sample estimates:
     cor 
0.021683 
#1.用product_id分組,算出其評論分數平均值
#2.算出評論分數與產品圖片數量之相關性
#3.算出評論分數與產品圖片數量之相關性
 #畫出產品圖片數量與平均分數之間的線性關係
ggplot(df01,aes(x = product_photos_qty, y = avgScore, col = product_photos_qty)) +
  scale_color_gradient(low = "#0DBF8C", high = "#CCCC4D") +
  stat_smooth(se=FALSE, col = "#FFA500", size=1) + 
  geom_hline(aes(yintercept=mean(avgScore)), col = "#66CDAA",linetype="dashed",size=1) +
  scale_x_continuous(breaks=c(1:20)) + theme_light() +
  ggtitle("product_photos_qty V.S Review Score")
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

🗿 商業分析:
  ■  對電子商務網站而言,商品展示是基本且十分重要的功能,且普遍認為「高品質的商品圖片」以及「適當的產品資訊量」,有助於提升消費者對於商品的好感度,本次分析商品圖片數量是否影響評論分數。

💡 建議:
 製作或拍攝產品圖片時,需注意圖片所呈現的資訊是否清楚。
 過多或過少的商品圖片數量,可能會造成商品資訊易讀性降低或是商品資訊不清。
 一般來說,3至9張商品圖片是最合適的圖片數量。



【C】產品名稱長度、描述長度是否會評論分數?

【C1】產品名稱長度與評論分數關係
df00 = left_join(
  unique(I[,c(1,3)]),
  unique(left_join(O[,1,F], R[,1:3])[,-2])
  )
Joining, by = "order_id"
Joining, by = "order_id"
df00 = group_by(df00, product_id) %>% summarise(
  avgScore = mean(review_score)
  ) %>% left_join(P[,c(1,3)])
Joining, by = "product_id"
df00 = df00[complete.cases(df00),]

ggplot(df00, aes(x = df00$product_name_lenght, y = df00$avgScore, col = product_name_lenght)) + 
  geom_point(shape = 1, size = 1, alpha = I(1/5)) + 
  scale_color_gradient(low = "#0DBF8C", high = "#CCCC4D") +
  stat_smooth(method = lm, se=F, col = "#FFA500", size=1) + 
  xlab("Product Name Lenght") + ylab("Review Score") + 
  scale_x_continuous(limits = c(5,76)) +
  theme_light() + ggtitle("Product Name Lenght V.S Review Score")

table(df00$product_name_lenght)

   5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20 
   2    1    2    2    8    5    7   13   16   11   28   35   37   51   72   79 
  21   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36 
  73  108  127  134  171  171  213  246  276  321  368  350  439  434  440  522 
  37   38   39   40   41   42   43   44   45   46   47   48   49   50   51   52 
 507  541  607  644  681  709  755  711  800  881  890  900  987 1039 1018 1259 
  53   54   55   56   57   58   59   60   61   62   63   64   66   67   68   69 
1330 1439 1683 1675 1719 1887 2025 2182   65   65  515   59    1    1    1    1 
  72   76 
   1    1 
cor.test(df00$avgScore, df00$product_name_lenght)

    Pearson's product-moment correlation

data:  df00$avgScore and df00$product_name_lenght
t = -0.633, df = 32300, p-value = 0.53
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.0144199  0.0073772
sample estimates:
       cor 
-0.0035217 

🗿 商業分析:
  ■  畫圖,圖看起來多集中在右上角,但原因是因為產品名長度40-60的產品數量較多,且回歸線看起來幾乎是平線,兩者看起來並無關係。
  ■  接著用統計方法做測試,結果的確為不顯著。

💡 建議:
 產品名稱長度與產品的評分高低關係並不高,因此建議新進賣家在命名時不必太在意名稱長短,只要記得將產品重點寫妥即可。



【C2】產品描述長度與評論分數關係
df01 = left_join(
  unique(I[,c(1,3)]),
  unique(left_join(O[,1,F], R[,1:3])[,-2])
  )
Joining, by = "order_id"
Joining, by = "order_id"
df01 = group_by(df01, product_id) %>% summarise(
  avgScore = mean(review_score)
  ) %>% left_join(P[,c(1,4)])
Joining, by = "product_id"
df01 = df01[complete.cases(df01),]

ggplot(df01, aes(x = df01$product_description_lenght, y = df01$avgScore, col = product_description_lenght)) + 
  geom_point(shape = 1, size = 1, alpha = I(1/5)) + 
  scale_color_gradient(low = "#0DBF8C", high = "#CCCC4D") +
  stat_smooth(method = lm, se=F, col = "#FFA500", size=1) + 
  xlab("product_description_lenght") + ylab("Review Score") + 
  scale_x_continuous(limits = c(0,4000)) +
  theme_light() + ggtitle("product_description_lenght V.S Review Score")

head(sort(table(df01$product_description_lenght), decreasing = T), 1000)

 404  729  651  703  184  236  303  352  375  246  314  339  486  298  595  216 
  94   86   66   66   65   65   63   62   60   58   57   55   55   54   54   53 
 289  382  297  224  254  444  296  381  734  818  330  695  235  238  385  396 
  53   53   52   51   51   51   50   50   50   50   49   49   48   48   48   48 
 192  273  354  365  217  222  264  267  275  280  341  361  409  247  293  364 
  47   47   47   47   46   46   46   46   46   46   46   46   46   45   45   45 
 702  223  255  348  410  418  919  198  220  233  240  259  278  284  360  372 
  45   44   44   44   44   44   44   43   43   43   43   43   43   43   43   43 
 394  423  458  487  215  249  269  362  414  416  471  186  218  302  315  437 
  43   43   43   43   42   42   42   42   42   42   42   41   41   41   41   41 
 438  473  569  624  696  207  231  300  305  309  313  324  395  411  509  534 
  41   41   41   41   41   40   40   40   40   40   40   40   40   40   40   40 
 540  679  209  212  225  234  244  262  317  331  373  413  419  422  451  499 
  40   40   39   39   39   39   39   39   39   39   39   39   39   39   39   39 
 500  589  599  646  123  179  205  208  219  265  322  327  389  466  555  561 
  39   39   39   39   38   38   38   38   38   38   38   38   38   38   38   38 
 580  189  252  270  271  282  326  338  383  401  463  493  510  558  567  582 
  38   37   37   37   37   37   37   37   37   37   37   37   37   37   37   37 
 586  638  718  731  777 1440  160  162  175  226  232  250  288  325  329  347 
  37   37   37   37   37   37   36   36   36   36   36   36   36   36   36   36 
 374  408  478  484  609  676  688  138  228  320  332  349  391  402  427  435 
  36   36   36   36   36   36   36   35   35   35   35   35   35   35   35   35 
 455  459  485  492  527  530  537  546  623  626  655  663  701  182  188  202 
  35   35   35   35   35   35   35   35   35   35   35   35   35   34   34   34 
 210  214  248  258  261  274  283  344  350  386  443  446  462  464  498  525 
  34   34   34   34   34   34   34   34   34   34   34   34   34   34   34   34 
 549  556  614  627  648  678  776  144  171  183  211  257  272  276  279  285 
  34   34   34   34   34   34   34   33   33   33   33   33   33   33   33   33 
 292  343  346  370  388  430  431  472  476  489  513  575  635  687  728  111 
  33   33   33   33   33   33   33   33   33   33   33   33   33   33   33   32 
 124  130  190  194  199  221  245  290  291  308  310  328  340  424  452  453 
  32   32   32   32   32   32   32   32   32   32   32   32   32   32   32   32 
 465  497  502  517  519  551  557  566  568  594  617  629  640  664  677  680 
  32   32   32   32   32   32   32   32   32   32   32   32   32   32   32   32 
 867  157  204  227  287  321  355  357  379  406  407  412  421  449  518  538 
  32   31   31   31   31   31   31   31   31   31   31   31   31   31   31   31 
 554  574  598  602  622  660  704  721  752  156  167  180  196  230  241  242 
  31   31   31   31   31   31   31   31   31   30   30   30   30   30   30   30 
 251  286  306  336  377  380  392  398  400  417  425  428  442  470  501  503 
  30   30   30   30   30   30   30   30   30   30   30   30   30   30   30   30 
 506  507  548  563  572  573  587  600  604  616  666  684  700  716  918 1156 
  30   30   30   30   30   30   30   30   30   30   30   30   30   30   30   30 
 147  268  301  304  312  318  335  342  345  353  359  390  393  403  405  440 
  29   29   29   29   29   29   29   29   29   29   29   29   29   29   29   29 
 508  512  515  516  528  576  581  603  608  610  636  641  683  770  789  803 
  29   29   29   29   29   29   29   29   29   29   29   29   29   29   29   29 
 886  126  140  153  169  187  243  253  334  351  371  456  461  474  477  523 
  29   28   28   28   28   28   28   28   28   28   28   28   28   28   28   28 
 532  553  559  565  578  591  593  597  601  605  606  665  675  740  750  121 
  28   28   28   28   28   28   28   28   28   28   28   28   28   28   28   27 
 150  200  294  295  316  369  432  434  439  450  457  488  529  531  545  547 
  27   27   27   27   27   27   27   27   27   27   27   27   27   27   27   27 
 562  571  632  661  670  673  706  709  864   94  125  197  237  260  281  311 
  27   27   27   27   27   27   27   27   27   26   26   26   26   26   26   26 
 363  366  415  445  447  475  479  490  495  520  524  535  585  596  607  631 
  26   26   26   26   26   26   26   26   26   26   26   26   26   26   26   26 
 647  669  672  686  723  751  805  815  840  868 1101  139  148  166  174  181 
  26   26   26   26   26   26   26   26   26   26   26   25   25   25   25   25 
 201  206  229  256  323  337  356  358  367  397  468  480  494  504  505  521 
  25   25   25   25   25   25   25   25   25   25   25   25   25   25   25   25 
 542  543  560  564  570  612  613  615  642  650  786  844  905  114  117  159 
  25   25   25   25   25   25   25   25   25   25   25   25   25   24   24   24 
 203  213  263  441  460  469  482  511  544  630  649  657  662  707  708  712 
  24   24   24   24   24   24   24   24   24   24   24   24   24   24   24   24 
 713  726  747  791  802  816  852  870  940  133  145  146  149  155  161  163 
  24   24   24   24   24   24   24   24   24   23   23   23   23   23   23   23 
 165  177  178  193  266  277  319  376  399  420  433  491  588  634  637  668 
  23   23   23   23   23   23   23   23   23   23   23   23   23   23   23   23 
 671  681  691  693  719  722  727  749  757  783  801  817  831  882  890  893 
  23   23   23   23   23   23   23   23   23   23   23   23   23   23   23   23 
 897  909  920  956 1750  115  128  132  137  143  158  168  170  299  378  384 
  23   23   23   23   23   22   22   22   22   22   22   22   22   22   22   22 
 429  483  526  539  577  644  652  653  654  715  738  744  766  796  799  807 
  22   22   22   22   22   22   22   22   22   22   22   22   22   22   22   22 
 811  821  829  863  891   84   87   92  113  127  141  152  164  172  185  307 
  22   22   22   22   22   21   21   21   21   21   21   21   21   21   21   21 
 333  426  436  448  454  552  584  590  618  621  659  690  705  733  736  762 
  21   21   21   21   21   21   21   21   21   21   21   21   21   21   21   21 
 763  808  813  833  851  862  872  875  880  925  998 2010  116  119  151  191 
  21   21   21   21   21   21   21   21   21   21   21   21   20   20   20   20 
 387  496  536  541  592  674  685  692  697  699  710  711  739  754  790  837 
  20   20   20   20   20   20   20   20   20   20   20   20   20   20   20   20 
 856  865  866  871  874  896  906  921  955 1223 1359 1749  120  481  514  633 
  20   20   20   20   20   20   20   20   20   20   20   20   19   19   19   19 
 645  724  725  732  735  737  743  748  767  781  784  792  812  830  853  855 
  19   19   19   19   19   19   19   19   19   19   19   19   19   19   19   19 
 869  899  903  927  951  989 1006 1235 1311  154  522  579  583  619  620  628 
  19   19   19   19   19   19   19   19   19   18   18   18   18   18   18   18 
 698  730  764  779  787  826  828  832  850  915  916  922  923  929  937  971 
  18   18   18   18   18   18   18   18   18   18   18   18   18   18   18   18 
 993 1002 1009 1057 1058 1076 1405   89  100  108  112  122  135  176  195  533 
  18   18   18   18   18   18   18   17   17   17   17   17   17   17   17   17 
 643  682  720  742  756  761  769  785  798  800  806  814  824  825  827  877 
  17   17   17   17   17   17   17   17   17   17   17   17   17   17   17   17 
 878  892  902  904  913  941  978  982 1047 1146 1151 1177   95  106  131  134 
  17   17   17   17   17   17   17   17   17   17   17   17   16   16   16   16 
 142  467  639  658  689  694  746  772  795  797  804  809  823  835  838  841 
  16   16   16   16   16   16   16   16   16   16   16   16   16   16   16   16 
 849  854  883  900  910  933  934  963  964 1023 1043 1064 1069 1078 1099 1209 
  16   16   16   16   16   16   16   16   16   16   16   16   16   16   16   16 
1212 1236 1290 1392   81   90  101  102  239  667  717  745  758  765  771  780 
  16   16   16   16   15   15   15   15   15   15   15   15   15   15   15   15 
 793  794  858  859  873  879  885  908  914  943  946  953  960  991 1003 1031 
  15   15   15   15   15   15   15   15   15   15   15   15   15   15   15   15 
1054 1080 1132 1145 1186 1271 1313 1322 1343 1400 1649   85  104  118  129  136 
  15   15   15   15   15   15   15   15   15   15   15   14   14   14   14   14 
 173  368  550  611  656  741  755  759  760  768  782  810  819  820  834  836 
  14   14   14   14   14   14   14   14   14   14   14   14   14   14   14   14 
 847  901  917  926  928  931  935  950  952  954  961  966  979  983 1026 1066 
  14   14   14   14   14   14   14   14   14   14   14   14   14   14   14   14 
1081 1096 1102 1130 1140 1143 1162 1168 1200 1360 1451 1541 2043   74  753  775 
  14   14   14   14   14   14   14   14   14   14   14   14   14   13   13   13 
 845  846  857  884  888  895  907  912  962  967  969  974  975  985 1000 1001 
  13   13   13   13   13   13   13   13   13   13   13   13   13   13   13   13 
1005 1007 1016 1018 1042 1053 1059 1087 1107 1112 1214 1219 1416 1988 2083   82 
  13   13   13   13   13   13   13   13   13   13   13   13   13   13   13   12 
  93   96   99  105  625  773  774  822  842  860  876  894  898  924  944  945 
  12   12   12   12   12   12   12   12   12   12   12   12   12   12   12   12 
 948  959  973  999 1048 1050 1074 1085 1095 1104 1105 1113 1123 1125 1135 1157 
  12   12   12   12   12   12   12   12   12   12   12   12   12   12   12   12 
1170 1182 1193 1198 1202 1220 1241 1246 1283 1892 1953 1969 1987   58   68   71 
  12   12   12   12   12   12   12   12   12   12   12   12   12   11   11   11 
  73  110  778  839  848  861  939  965  968  981  992  996 1024 1028 1033 1034 
  11   11   11   11   11   11   11   11   11   11   11   11   11   11   11   11 
1038 1045 1049 1056 1071 1072 1077 1084 
  11   11   11   11   11   11   11   11 
cor.test(df01$avgScore, df01$product_description_lenght)

    Pearson's product-moment correlation

data:  df01$avgScore and df01$product_description_lenght
t = 4.89, df = 32300, p-value = 0.000001
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.016307 0.038088
sample estimates:
     cor 
0.027201 

🗿 商業分析:
  ■  畫圖,圖看起來多集中在左上角,但原因是因為產品描述長度1000以下的產品數量較多,但回歸線看起來有點正相關的趨勢。
  ■  因此用統計方法做測試,結果為顯著。

💡 建議:
 產品描述長度與評分高低是呈現正相關,因此建議新進賣家可盡量將產品描述豐富詳實,以提高自身產品評價。



【D】商品到貨天數與評分分數的關係?

df = group_by(R, order_id) %>% summarise(score = mean(review_score))
df = merge(O, df, by='order_id', all.x=T) 
df <- na.omit(df)
df = mutate(df, duration=as.numeric(difftime(
  order_delivered_customer_date, order_approved_at, units="days"))) 
cor.test(df$score, df$duration) 

    Pearson's product-moment correlation

data:  df$score and df$duration
t = -110, df = 96500, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.34065 -0.32944
sample estimates:
     cor 
-0.33506 
table(df$order_status) 

 canceled delivered 
        6     96455 
df_canceled <- df %>% filter(order_status == "canceled") 
#1.計算貨運間隔時間與評分相關性檢定
#2.以order id為分組,算review score的平均數
#3.將order dataset與df用order id合併
#4.計算時間間隔,命名為duration,新增到df中
#5.皮爾森相關檢定,結果顯著
#6.篩出取消之訂單,有6筆訂單有送達,但最後取消,可能是已送達卻未取件(確定一下custumer_id有無重複)
#7.刪掉取消的6筆訂單
summary(df)
   order_id         customer_id        order_status      
 Length:96461       Length:96461       Length:96461      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
 order_purchase_timestamp      order_approved_at            
 Min.   :2016-09-15 12:16:38   Min.   :2016-09-15 12:16:38  
 1st Qu.:2017-09-14 09:28:28   1st Qu.:2017-09-14 14:30:14  
 Median :2018-01-20 19:59:42   Median :2018-01-22 13:49:00  
 Mean   :2018-01-01 23:53:26   Mean   :2018-01-02 10:10:06  
 3rd Qu.:2018-05-05 18:33:24   3rd Qu.:2018-05-06 10:30:49  
 Max.   :2018-08-29 15:00:37   Max.   :2018-08-29 15:10:26  
 order_delivered_carrier_date  order_delivered_customer_date
 Min.   :2016-10-08 10:34:01   Min.   :2016-10-11 13:46:32  
 1st Qu.:2017-09-18 16:52:19   1st Qu.:2017-09-25 22:31:59  
 Median :2018-01-24 16:19:03   Median :2018-02-02 19:50:56  
 Mean   :2018-01-05 05:21:04   Mean   :2018-01-14 13:17:13  
 3rd Qu.:2018-05-08 14:33:00   3rd Qu.:2018-05-15 23:08:54  
 Max.   :2018-09-11 19:48:28   Max.   :2018-10-17 13:22:46  
 order_estimated_delivery_date     score         duration     
 Min.   :2016-10-04 00:00:00   Min.   :1.00   Min.   : -6.99  
 1st Qu.:2017-10-05 00:00:00   1st Qu.:4.00   1st Qu.:  6.31  
 Median :2018-02-16 00:00:00   Median :5.00   Median :  9.85  
 Mean   :2018-01-25 17:33:14   Mean   :4.14   Mean   : 12.13  
 3rd Qu.:2018-05-28 00:00:00   3rd Qu.:5.00   3rd Qu.: 15.14  
 Max.   :2018-10-25 00:00:00   Max.   :5.00   Max.   :208.50  
df = filter(df,df$duration >0)
summary(df)
   order_id         customer_id        order_status      
 Length:96400       Length:96400       Length:96400      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
 order_purchase_timestamp      order_approved_at            
 Min.   :2016-09-15 12:16:38   Min.   :2016-09-15 12:16:38  
 1st Qu.:2017-09-14 09:06:45   1st Qu.:2017-09-14 13:30:21  
 Median :2018-01-20 18:22:49   Median :2018-01-22 13:45:02  
 Mean   :2018-01-01 22:10:15   Mean   :2018-01-02 08:22:00  
 3rd Qu.:2018-05-05 17:26:10   3rd Qu.:2018-05-06 00:58:29  
 Max.   :2018-08-29 15:00:37   Max.   :2018-08-29 15:10:26  
 order_delivered_carrier_date  order_delivered_customer_date
 Min.   :2016-10-08 10:34:01   Min.   :2016-10-11 13:46:32  
 1st Qu.:2017-09-18 16:46:13   1st Qu.:2017-09-25 22:18:27  
 Median :2018-01-24 15:26:45   Median :2018-02-02 18:55:39  
 Mean   :2018-01-05 03:38:43   Mean   :2018-01-14 11:41:39  
 3rd Qu.:2018-05-08 14:21:00   3rd Qu.:2018-05-15 22:03:05  
 Max.   :2018-09-11 19:48:28   Max.   :2018-10-17 13:22:46  
 order_estimated_delivery_date     score         duration      
 Min.   :2016-10-04 00:00:00   Min.   :1.00   Min.   :  0.008  
 1st Qu.:2017-10-05 00:00:00   1st Qu.:4.00   1st Qu.:  6.315  
 Median :2018-02-16 00:00:00   Median :5.00   Median :  9.857  
 Mean   :2018-01-25 15:58:40   Mean   :4.14   Mean   : 12.139  
 3rd Qu.:2018-05-28 00:00:00   3rd Qu.:5.00   3rd Qu.: 15.145  
 Max.   :2018-10-25 00:00:00   Max.   :5.00   Max.   :208.501  
sd(df$duration) 
[1] 9.5126
library(EnvStats)

Attaching package: 'EnvStats'
The following objects are masked from 'package:stats':

    predict, predict.lm
The following object is masked from 'package:base':

    print.default
rosnerTest(df$duration)

Results of Outlier Test
-------------------------

Test Method:                     Rosner's Test for Outliers

Hypothesized Distribution:       Normal

Data:                            df$duration

Sample Size:                     96400

Test Statistics:                 R.1 = 20.642
                                 R.2 = 20.671
                                 R.3 = 19.293

Test Statistic Parameter:        k = 3

Alternative Hypothesis:          Up to 3 observations are not
                                 from the same Distribution.

Type I Error:                    5%

Number of Outliers Detected:     3

  i Mean.i   SD.i  Value Obs.Num  R.i+1 lambda.i+1 Outlier
1 0 12.139 9.5126 208.50   76095 20.642      5.019    TRUE
2 1 12.137 9.4916 208.34   10277 20.671      5.019    TRUE
3 2 12.135 9.4706 194.85   17876 19.293      5.019    TRUE
bench <- 15 + 1.5*IQR(df$duration)
bench #28.5
[1] 28.245
df_outlier = df %>% filter(df$duration > 28.5)#讓df篩出偏離值
df = anti_join(df, df_outlier, by = "order_id")
which.max(df$duration) #確定偏離值已篩出
[1] 11041
df$duration[629] #確定最大值仍小於28.5
[1] 28.192
#1.發現duration有負值,篩出duration > 0 者
#summary(df) #to check negative value are filtered.
#sd = 9.5126
#篩出偏離值
#計算偏離值
#新的df資料描述
summary(df)#mean score = 4.24
   order_id         customer_id        order_status      
 Length:91387       Length:91387       Length:91387      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
 order_purchase_timestamp      order_approved_at            
 Min.   :2016-10-03 09:44:50   Min.   :2016-10-04 09:43:32  
 1st Qu.:2017-09-11 20:18:58   1st Qu.:2017-09-12 04:50:39  
 Median :2018-01-20 09:38:35   Median :2018-01-20 15:41:24  
 Mean   :2018-01-02 10:36:16   Mean   :2018-01-02 20:46:55  
 3rd Qu.:2018-05-08 21:50:49   3rd Qu.:2018-05-09 09:43:32  
 Max.   :2018-08-29 15:00:37   Max.   :2018-08-29 15:10:26  
 order_delivered_carrier_date  order_delivered_customer_date
 Min.   :2016-10-08 10:34:01   Min.   :2016-10-11 13:46:32  
 1st Qu.:2017-09-14 13:58:53   1st Qu.:2017-09-21 17:36:09  
 Median :2018-01-23 21:52:51   Median :2018-01-31 22:28:39  
 Mean   :2018-01-05 10:51:35   Mean   :2018-01-13 11:23:04  
 3rd Qu.:2018-05-10 16:27:00   3rd Qu.:2018-05-17 20:18:50  
 Max.   :2018-08-31 15:25:00   Max.   :2018-09-11 00:37:34  
 order_estimated_delivery_date     score         duration      
 Min.   :2016-10-27 00:00:00   Min.   :1.00   Min.   : 0.0077  
 1st Qu.:2017-10-03 00:00:00   1st Qu.:4.00   1st Qu.: 6.1535  
 Median :2018-02-15 00:00:00   Median :5.00   Median : 9.3555  
 Mean   :2018-01-25 20:15:07   Mean   :4.24   Mean   :10.6084  
 3rd Qu.:2018-05-30 00:00:00   3rd Qu.:5.00   3rd Qu.:14.0860  
 Max.   :2018-10-25 00:00:00   Max.   :5.00   Max.   :28.4995  
             #mean duration = 10.2,median = 9。
sd(df$duration) #df = 6
[1] 6.012
#試算正一個標準差後的分數
df_1sd = df %>% filter(df$duration >= 15)
mean(df_1sd$score) #一個標準差後的平均分數為3.9
[1] 3.9186
#計算眾數
df_00 = cbind(df$order_id,df$duration) %>% data.frame()
df_00$X2 = as.integer(as.character(df_00$X2))
df$duration = as.integer(as.character(df$duration))
#mode 01
names(table(df_00$X2))[table(df_00$X2)==max(table(df_00$X2))] #7
[1] "7"
#計算眾數的平均數
library(dplyr)
df_modescore = filter(df,df$duration >= 7 & df$duration < 8) 
mean(df_modescore$score) #4.3542
[1] 4.3542
score2 <- round(df$score, digits = 0)
df <-cbind(df, score2)
df = df[,-12]

boxplot(formula=duration~score2,
        data = df, xlab="score2", ylab = "duration", col ="lightblue")

🗿 商業分析:
  ■  1.由相關檢定發現,(不論篩過偏離值與否)貨運時間間隔與滿意度評分有顯著相關,意即顧客等待時間越久,整體的滿意度會越低。
  ■  2.(mode = 7)大部分時間間隔在7日左右,平均評分落在4.4分,高於平均4.24,代表多數時候,olist的訂單及貨運處理流程是令人滿意的。
  ■  3.olist的出貨品質(到貨天數)是不穩定的嗎,標準差為6日左右,接近平均送達天數的一倍。而到貨天數不穩定會直接影響評論分數。

💡 建議:
 標準差為6日左右,代表olist貨運間隔時間並不穩定,然此將直接攸關滿意度評分,因此建議olist從訂單處理、備貨、出送貨流程優化下手,以獲得更好的顧客回饋。